PCAWG fits

Here we’ll plot all the CCF histograms along with annotations of subclonal drivers. We present a subset of cases in Figure 1 and provide all the plots in Supplementary Figure 1. We’ll first read in the MAF CCF file we processed earlier.

library(data.table)
library(tidyverse)
# Load the per-mutation CCF table produced by the earlier processing step.
maf_ccf <- fread("processed_data/maf_subclone_ccf.csv.gz")

# Strip the ".clearcell" suffix from the histology abbreviation. The pattern is
# wrapped in fixed() so the leading "." is matched literally rather than being
# interpreted as the regex "any character" wildcard.
maf_ccf <- maf_ccf %>%
  mutate(
    histology_abbreviation = str_remove_all(
      histology_abbreviation,
      fixed(".clearcell")
    )
  )
library(ggrepel)
library(cowplot)
library(gtools)

CCF histograms

library(ggrepel)
library(cowplot)
library(gtools)

# Assign every sample to a block of 100 consecutive samples, ordered by
# histology. `idx` is the zero-based offset of the block (0, 100, 200, ...), so
# the block named `idx` holds samples idx + 1 through idx + 100, which is what
# the per-chunk headings printed by the plotting loop claim.
#
# Integer division on a zero-based row index is used instead of
# round(row_number() / 100): rounding puts rows 1-50 in block 0 and rows 51-149
# in block 100 (R rounds halves to even), so blocks would not line up with
# their headings.
sample_to_idx <- distinct(maf_ccf, Tumor_Sample_Barcode, histology_abbreviation) %>%
  arrange(histology_abbreviation) %>%
  mutate(idx = paste0((row_number() - 1) %/% 100 * 100))

# Named list: block offset (as a string) -> vector of sample barcodes.
sample_ids_chunk <- split(sample_to_idx$Tumor_Sample_Barcode, sample_to_idx$idx)

# Legend breaks and colours shared by every chunk plot; computed once instead
# of on every iteration.
cluster_breaks <- paste0("cluster", 1:6)
cluster_palette <- RColorBrewer::brewer.pal(6, "Dark2")

# For each chunk of samples: print a markdown heading, then a faceted CCF
# histogram (one facet per sample). When the chunk contains subclonal drivers,
# each driver is additionally marked with a dashed vertical line at its CCF and
# labelled with its gene name.
for (ct in mixedsort(names(sample_ids_chunk))) {
  cat(paste0('## ', as.numeric(ct) + 1, '-', as.numeric(ct) + 100, ' \n'))
  samples <- sample_ids_chunk[[ct]]

  # Mutations belonging to this chunk, and the subset flagged as subclonal
  # drivers.
  dfchunk <- maf_ccf %>%
    filter(Tumor_Sample_Barcode %in% samples)
  dfdrivers <- dfchunk %>%
    filter(issubclonaldriver == TRUE)

  # Base plot, identical whether or not the chunk has drivers.
  g <- ggplot(dfchunk, aes(x = ccf, fill = cluster)) +
    geom_histogram(bins = 100, position = "stack", alpha = 0.6) +
    facet_wrap(histology_abbreviation~icgc_sample_id, scales = "free_y") +
    scale_color_manual(values = cluster_palette, breaks = cluster_breaks) +
    scale_fill_manual(values = cluster_palette, breaks = cluster_breaks) +
    theme_cowplot(font_size = 11) +
    scale_y_continuous(breaks = scales::pretty_breaks(n = 3)) +
    scale_x_continuous(breaks = c(0.0, 1.0, 2.0), limits = c(0, 2)) +
    panel_border() +
    xlab("CCF") +
    ylab("Counts")

  # Driver annotations are only added when there is something to annotate;
  # the layers are skipped entirely for chunks without drivers.
  if (nrow(dfdrivers) > 0) {
    g <- g +
      geom_text_repel(
        data = dfdrivers,
        aes(label = gene, y = Inf, x = ccf),
        col = "black", alpha = 0.85, size = 2.0
      ) +
      geom_vline(
        data = dfdrivers,
        aes(xintercept = ccf, col = cluster),
        lty = 2, size = 0.25, show.legend = FALSE
      )
  }

  print(g)
  cat(' \n \n')
}

1-100

101-200

201-300

301-400

401-500

501-600

601-700

701-800

801-900

901-1000

1001-1100

1101-1200

1201-1300

1301-1400

## png 
##   2

Figure 1

The IDs for the 41 cases we include in Figure 1 are available in PCAWG_subclonal_drivers.csv. We’ll now pull these out and plot the CCF histograms. This is Figure 1 in our paper.

# Read the driver metadata and keep only the rows flagged include == "Y";
# their sample barcodes define the set of cases shown in Figure 1.
md <- filter(fread("metadata/PCAWG_subclonal_drivers.csv"), include == "Y")
samples_to_include <- md[["Tumor_Sample_Barcode"]]

# Subclonal driver mutations for the Figure 1 samples, with `x` giving the
# horizontal position of each gene label. Within a sample, drivers are ordered
# by CCF: the first (lowest-CCF) driver's label is shifted 0.4 to the left of
# its own CCF, clamped at 0, and every other label is shifted 0.3 to the right.
#
# pmax() is used for the clamp because it works row by row. A plain max()
# inside a grouped mutate() collapses to a single value per sample (the largest
# ccf - 0.4 in the group), which would offset the first label from the wrong
# driver whenever a sample has more than one.
dfdrivers <- maf_ccf %>%
  mutate(cluster = str_remove(cluster, "cluster")) %>%
  filter(issubclonaldriver == TRUE) %>%
  filter(Tumor_Sample_Barcode %in% samples_to_include) %>%
  arrange(Tumor_Sample_Barcode, ccf) %>%
  group_by(Tumor_Sample_Barcode) %>%
  mutate(x = row_number()) %>%
  mutate(x = ifelse(x == 1, pmax(ccf - 0.4, 0.0), ccf + 0.3)) %>%
  # Drop the grouping so later operations on dfdrivers are not silently
  # performed per sample.
  ungroup()

# Figure 1: one CCF histogram per included sample, filled by cluster (with the
# "cluster" prefix stripped from the labels). Each subclonal driver gets a
# dashed vertical line at its CCF and a gene label at the precomputed x offset.
g <- maf_ccf %>%
    filter(Tumor_Sample_Barcode %in% samples_to_include) %>%
    mutate(cluster = str_remove(cluster, "cluster")) %>%
    ggplot(aes(x = ccf, fill = cluster)) +
    geom_histogram(bins = 100, position = "stack", alpha = 0.6, size = 0.0) +
    facet_wrap(~icgc_sample_id, scales = "free_y") +
    geom_text_repel(
      mapping = aes(label = gene, y = Inf, x = x),
      data = dfdrivers,
      force = 1,
      col = "black",
      alpha = 0.85,
      size = 1.5
    ) +
    geom_vline(
      mapping = aes(xintercept = ccf, col = cluster),
      data = dfdrivers,
      lty = 2,
      size = 0.25,
      show.legend = FALSE
    ) +
    scale_color_manual(
      values = RColorBrewer::brewer.pal(6, "Dark2"),
      breaks = as.character(1:6)
    ) +
    scale_fill_manual(
      values = RColorBrewer::brewer.pal(6, "Dark2"),
      breaks = as.character(1:6)
    ) +
    theme_cowplot(font_size = 8, line_size = 0.25) +
    # Place the legend inside the panel grid, near the bottom-right corner.
    theme(legend.position = c(0.9, 0.075)) +
    scale_y_continuous(breaks = scales::pretty_breaks(n = 3)) +
    scale_x_continuous(breaks = c(0.0, 1.0, 2.0), limits = c(0, 2)) +
    guides(fill = guide_legend(nrow = 2)) +
    panel_border() +
    labs(x = "CCF", y = "Counts", fill = "Cluster")

# Render the figure.
g
Figure 1: 41 cases with subclonal drivers

Figure 1: 41 cases with subclonal drivers

# Write Figure 1 to disk as both PNG and PDF, using the same dimensions.
for (figure_file in c("Figure1.png", "Figure1.pdf")) {
  save_plot(filename = figure_file, plot = g, base_height = 5, base_width = 7)
}

For these 41 cases we’ll also pull out the clustering probabilities for each cluster, which is used in one of our supplementary figures.

library(glue)
# Read the per-mutation cluster assignment table for every Figure 1 sample,
# dropping rows with mut_type "SV" and the chromosome2/position2 columns, and
# tagging each row with its sample barcode as the first column.
clust_list <- lapply(samples_to_include, function(sa) {
  fread(glue("subclones_data/{sa}_cluster_assignments.txt.gz"))[mut_type != "SV"] %>%
    mutate(sample = sa) %>%
    select(sample, everything(), -chromosome2, -position2)
})

# Bind all samples in one call instead of growing the data frame inside a
# loop, which re-copies the accumulated rows on every iteration. The leading
# empty data.frame() keeps the result a plain data.frame (and an empty one if
# there are no samples), as the incremental version produced.
dfclust <- bind_rows(data.frame(), clust_list)
fwrite(dfclust, "processed_data/cluster_probs_41cases.csv.gz")